import re
import string
def text_temizle1(text):
    """Normalize a raw text string for word-frequency analysis.

    Lowercases the text, then removes bracketed spans, punctuation,
    digit-containing tokens, and words shorter than 3 letters
    (Turkish characters included). Whitespace left behind by removed
    tokens is NOT collapsed.

    Args:
        text: input string.
    Returns:
        The cleaned string.
    """
    # lowercase everything
    text = text.lower()
    # drop anything inside square brackets, e.g. "[link]"
    # (raw strings avoid invalid-escape warnings on modern Python)
    text = re.sub(r'\[.*?\]', '', text)
    # strip every punctuation character
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove tokens that contain a digit (e.g. "covid19", "123")
    text = re.sub(r'\w*\d\w*', '', text)
    # drop 1-2 letter words, Turkish characters included
    text = re.sub(r'\b[a-zğüşöçıİĞÜŞÖÇ]{1,2}\b', r'', text)
    return text
import tweepy
import pandas as pd
import warnings
# NOTE(review): this silences ALL warnings process-wide, including useful
# pandas/tweepy deprecation notices — consider narrowing the filter.
warnings.filterwarnings(action = 'ignore')
# SECURITY NOTE(review): live Twitter API credentials are hardcoded below.
# These must be revoked/rotated immediately and loaded from environment
# variables or a secrets store instead of being committed to source.
access_token = "4643784988-lVqQxcJqbSz3nf1C2QsfEqClnh1gDAiur44qhXx"
access_token_secret = "s1ronrb01KE9ezxArFSVgeUMgvGFq6mOh9c9HsKuYKlvJ"
consumer_key = "e4hDcQIzGTKewbysmz2TbYIZd"
consumer_secret= "t6vx97uZaG7FmAcCuzVB1MdeKbWFxJ3ucCmHwFtytLlo2K11NE"
# OAuth 1.0a handshake; wait_on_rate_limit makes tweepy sleep through
# rate-limit windows instead of raising.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.tokenize import sent_tokenize, word_tokenize
import requests
import numpy as np
import matplotlib.pyplot as plt
from gensim.parsing.preprocessing import remove_stopwords
class TwitterSentimentAnalyser:
    """Fetches tweets matching a keyword via the Twitter search API.

    After getTwitterData() is called, the results are stored on
    ``self.tweets`` as a pandas DataFrame with columns
    Tarih (creation timestamp), Tweet_id and Text.
    """

    def __init__(self, consumer_key, consumer_secret, access_token, access_token_secret, keyword, tweetCount):
        self.keyword = keyword
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_token_secret = access_token_secret
        self.tweetCount = tweetCount
        # BUG FIX: the original stored the credentials but never used them,
        # relying instead on a module-level ``api``. Build the client from
        # the credentials actually passed in.
        _auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        _auth.set_access_token(access_token, access_token_secret)
        self.api = tweepy.API(_auth, wait_on_rate_limit=True)

    def getTwitterData(self):
        """Search for ``self.keyword`` and store results in ``self.tweets``."""
        tweets_list = []
        # BUG FIX: 'en-en' is not a valid ISO 639-1 language code; use 'en'.
        for tweet in self.api.search_tweets(q=self.keyword, count=self.tweetCount, lang='en'):
            tweets_list.append((tweet.created_at, tweet.id, tweet.text))
        self.tweets = pd.DataFrame(tweets_list, columns=["Tarih", 'Tweet_id', 'Text'])
# Fetch up to 1000 'corona' tweets at import time using the module-level
# credentials (network I/O happens here).
twst = TwitterSentimentAnalyser(consumer_key = consumer_key, consumer_secret = consumer_secret,
access_token = access_token, access_token_secret = access_token_secret,
keyword='corona', tweetCount=1000)
twst.getTwitterData()
# Notebook leftover: head() return value is discarded when run as a script.
twst.tweets.head()
# Work on a copy so the raw download stays untouched.
data = twst.tweets.copy()
def preProcess(ReviewText):
    """Lowercase a pandas Series of tweets and strip twitter/HTML noise.

    Removes "rt"/"retweet" markers, <br/> tags, anchor tags, stray
    '&', '<' and '>' characters, non-breaking spaces, and http/https URLs.

    Args:
        ReviewText: pandas Series of strings.
    Returns:
        The cleaned pandas Series.
    """
    ReviewText = ReviewText.str.lower()
    # BUG FIX: pass regex=True explicitly. On pandas >= 2.0 str.replace
    # defaults to regex=False, so these patterns would be matched literally
    # and silently stop working.
    ReviewText = ReviewText.str.replace("(rt)", "", regex=True)
    ReviewText = ReviewText.str.replace("(retweet)", "", regex=True)
    ReviewText = ReviewText.str.replace("(<br/>)", "", regex=True)
    ReviewText = ReviewText.str.replace('(<a).*(>).*(</a>)', '', regex=True)
    ReviewText = ReviewText.str.replace('(&)', '', regex=True)
    ReviewText = ReviewText.str.replace('(>)', '', regex=True)
    ReviewText = ReviewText.str.replace('(<)', '', regex=True)
    ReviewText = ReviewText.str.replace('(\xa0)', ' ', regex=True)
    # Replace http/https links with a single space.
    ReviewText = ReviewText.str.replace(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', ' ', regex=True)
    return ReviewText
data['Text'] = preProcess(data['Text'])
from tqdm import tqdm
import socket
from TurkishStemmer import TurkishStemmer

stemmer = TurkishStemmer()
# Stem every whitespace-separated token of every tweet. A join replaces the
# original repeated string concatenation (which is quadratic in tweet
# length); the leading-space artifact of the original accumulator
# (big_text started as " " and each token was appended as " " + token)
# is reproduced exactly so the output column is byte-identical.
stemmed_lists = []
for index in tqdm(data.index):
    tokens = data.loc[index]['Text'].split(" ")
    stemmed_lists.append(" " + "".join(" " + stemmer.stem(tok) for tok in tokens))
data['stemmed'] = stemmed_lists
# Notebook leftover: head() return value is discarded when run as a script.
data.head()
100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 2179.80it/s]
| Tarih | Tweet_id | Text | stemmed | |
|---|---|---|---|---|
| 0 | 2022-01-23 15:28:39+00:00 | 1485273278549794824 | @argonerd @joerncarmaker saying from a person ... | @argonerd @joerncarmaker saying from a perso... |
| 1 | 2022-01-23 15:28:39+00:00 | 1485273278189281283 | @hindisyahi: 6 yrs me ek exam liye hain aur a... | @hindisyahi: 6 yrs me ek exam li hai aur ab... |
| 2 | 2022-01-23 15:28:35+00:00 | 1485273261801971723 | @greenswelfares: protection from the virus- m... | @greenswelfares: protection from the virus-... |
| 3 | 2022-01-23 15:28:35+00:00 | 1485273260728233988 | @clacaeuropa: is it widely known in the uk th... | @clacaeuropa: is it widely known in the uk ... |
| 4 | 2022-01-23 15:28:33+00:00 | 1485273255917326336 | @usmoality: 🚨 just in: new study by german pr... | @usmoality: 🚨 just in: new study by german ... |
# Notebook leftover: head() return value is discarded when run as a script.
data.head()
def get_top_n_words(corpus, n=None):
    """Return the n most frequent words in *corpus* as (word, count) pairs.

    A CountVectorizer is fitted on the corpus to build the vocabulary;
    counts are summed over all documents. With n=None the whole
    vocabulary is returned, sorted by descending frequency.
    """
    vectorizer = CountVectorizer().fit(corpus)
    counts = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, counts[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize
# English stop-word list for stop-word filtering (requires the NLTK
# 'stopwords' corpus to be downloaded).
eng_stopwords = stopwords.words('english')
def remove_mystopwords(data):
    """Remove English stop words from a whitespace-separated string.

    BUG FIX: the original body referenced undefined names (``sentence``,
    ``text_tokens``, ``my_stopwords``) and raised NameError on every call;
    it now filters the ``data`` argument against the module-level
    ``eng_stopwords`` list.

    Args:
        data: input string.
    Returns:
        The string with English stop words removed.
    """
    tokens = data.split(" ")
    tokens_filtered = [word for word in tokens if word not in eng_stopwords]
    return " ".join(tokens_filtered)
# Dump the full DataFrame for inspection (notebook leftover).
print(data)
Tarih Tweet_id \
0 2022-01-23 15:28:39+00:00 1485273278549794824
1 2022-01-23 15:28:39+00:00 1485273278189281283
2 2022-01-23 15:28:35+00:00 1485273261801971723
3 2022-01-23 15:28:35+00:00 1485273260728233988
4 2022-01-23 15:28:33+00:00 1485273255917326336
.. ... ...
95 2022-01-23 15:24:04+00:00 1485272124021518340
96 2022-01-23 15:23:55+00:00 1485272088038772737
97 2022-01-23 15:23:50+00:00 1485272067159339013
98 2022-01-23 15:23:50+00:00 1485272067096387585
99 2022-01-23 15:23:49+00:00 1485272062340272135
Text \
0 @argonerd @joerncarmaker saying from a person ...
1 @hindisyahi: 6 yrs me ek exam liye hain aur a...
2 @greenswelfares: protection from the virus- m...
3 @clacaeuropa: is it widely known in the uk th...
4 @usmoality: 🚨 just in: new study by german pr...
.. ...
95 @cbsphilly don’t stray to far off campus. coro...
96 @dineshdsouza the lady is absolutely right. fe...
97 @shaneparr_ @alicebellamy @mitrebarnet @andrew...
98 @covidlive: 41,428 new cases and 20 new death...
99 @si_lv_er: apa from corona, these are depress...
stemmed
0 @argonerd @joerncarmaker saying from a perso...
1 @hindisyahi: 6 yrs me ek exam li hai aur ab...
2 @greenswelfares: protection from the virus-...
3 @clacaeuropa: is it widely known in the uk ...
4 @usmoality: 🚨 just in: new study by german ...
.. ...
95 @cbsphilly don’t stray to far off campus. co...
96 @dineshdsouza the lady is absolutely right. ...
97 @shaneparr_ @alicebellamy @mitrebarnet @andr...
98 @covidlive: 41,428 new cases and 20 new dea...
99 @si_lv_er: ap from corona, thes are depress...
[100 rows x 4 columns]
# Apply the regex-based cleaner to every tweet; result is a one-column frame.
data_clean = pd.DataFrame(data.Text.apply(text_temizle1))
# Bare expression: displays only in a notebook, no effect in a script.
data_clean
| Text | |
|---|---|
| 0 | argonerd joerncarmaker saying from person che... |
| 1 | hindisyahi yrs exam liye hain aur abhi cor... |
| 2 | greenswelfares protection from the virus mask... |
| 3 | clacaeuropa widely known the that decemb... |
| 4 | usmoality 🚨 just new study german prof kuhb... |
| ... | ... |
| 95 | cbsphilly don’ stray far off campus corona t... |
| 96 | dineshdsouza the lady absolutely right fear ... |
| 97 | shaneparr alicebellamy mitrebarnet andrewleedr... |
| 98 | covidlive new cases and new deaths germany... |
| 99 | silver apa from corona these are depressing t... |
100 rows × 1 columns
# NOTE(review): CountVectorizer is imported here, after get_top_n_words was
# defined but before it is called — the name resolves at call time.
from sklearn.feature_extraction.text import CountVectorizer
common_words = get_top_n_words(data_clean['Text'], 25)
# Column labels are Turkish: kelime = word, 'geçiş frekansı' = frequency.
df2 = pd.DataFrame(common_words, columns = ['kelime' , 'geçiş frekansı'])
fig = plt.figure(figsize=[25,5])
ax = fig.add_axes([0,0,1,1])
# Bar chart of the 25 most frequent words (before stemming); the title
# translates to "Most frequent words without stemming".
ax.bar(df2.kelime,df2['geçiş frekansı'])
plt.title('Kök Bulmadan En Çok Geçen Kelimeler')
plt.show()
def red_color_func(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None):
    """WordCloud color callback: paint every word the same fixed color.

    NOTE(review): despite the function's name, "#85bb65" is a green
    ("dollar bill") shade. All parameters exist only to satisfy the
    WordCloud color_func callback signature and are ignored.
    """
    return "#85bb65"
# Concatenate all tweet texts into one document for the word cloud.
text = " ".join(review for review in data.Text)
# BUG FIX: the original called generate(text) twice (once via the
# constructor chain and once again on the instance); once is enough.
wordcloud = WordCloud().generate(text)
wordcloud.to_file("wordcloud.png")
# Display the cloud without axes.
plt.figure(figsize=[200,100])
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()